from IPython.display import display, HTML

import dateparser
import datetime
import handcalcs.render
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
import pycountry
from pycountry_convert import country_alpha2_to_continent_code, country_alpha3_to_country_alpha2
### KK: add simple docstrings

# Clean up the dataset
def name_to_iso3(x):
    """Resolve a country name to its ISO 3166-1 alpha-3 code.

    The first hit of pycountry's fuzzy search is used.

    Args:
        x: Country name, e.g. ``"Germany"``.

    Returns:
        The alpha-3 code, or ``""`` when ``x`` is not a string (e.g. a NaN
        from an empty CSV cell) or no country matches.
    """
    if not isinstance(x, str):
        return ""
    # fuzzy search does not like UK
    if x == "UK":
        x = "United Kingdom"
    try:
        return pycountry.countries.search_fuzzy(x)[0].alpha_3
    except LookupError:
        # raised by search_fuzzy when nothing matches
        return ""

def alpha3_to_alpha2(x):
    """Convert an ISO 3166-1 alpha-3 code to its alpha-2 code.

    Args:
        x: Alpha-3 country code, e.g. ``"DEU"``.

    Returns:
        The alpha-2 code, or ``""`` when ``x`` is empty, not a string, or
        not a known alpha-3 code.
    """
    if not isinstance(x, str) or not x:
        return ""
    try:
        return country_alpha3_to_country_alpha2(x)
    except (KeyError, TypeError):
        # unknown code: keep the best-effort behaviour and return an empty code
        return ""

def alpha2_to_continent(x):
    """Map an ISO 3166-1 alpha-2 code to a continent code.

    Args:
        x: Alpha-2 country code, e.g. ``"DE"``.

    Returns:
        The continent code returned by ``pycountry_convert``, or ``""`` when
        ``x`` is empty, not a string, or not a known alpha-2 code.
    """
    if not isinstance(x, str) or not x:
        return ""
    try:
        return country_alpha2_to_continent_code(x)
    except (KeyError, TypeError):
        # unknown code: keep the best-effort behaviour and return an empty code
        return ""


def upper_string(lower_string):
    """Return ``lower_string`` in title case.

    Despite the name, the text is not fully upper-cased: ``str.title`` is
    applied, so every word starts with a capital letter and the remaining
    letters are lower-cased.
    """
    title_cased = lower_string.title()
    return title_cased

def calc_age(x):
    """Return the time elapsed since the date string ``x`` in years.

    ``x`` is parsed with ``dateparser`` (``TIMEZONE`` setting ``'CEST'``).
    The whole days between the parsed date and the current local time are
    divided by 365, so leap days are not accounted for.
    """
    now = datetime.datetime.now()
    parsed = dateparser.parse(x, settings={'TIMEZONE': 'CEST'})
    elapsed = now - parsed
    return elapsed.days / 365

def count_strings(comma_seperated_string):
    """Count the entries of a comma separated string.

    The previous implementation counted the commas, which is one less than
    the number of entries and made a single entry indistinguishable from no
    entry at all.

    Args:
        comma_seperated_string: Text such as ``"a,b,c"``. Any non-string
            value (e.g. NaN from an empty CSV cell) is treated as empty.

    Returns:
        Number of non-blank entries; ``0`` for empty or non-string input.
    """
    if not isinstance(comma_seperated_string, str):
        return 0
    # blank fragments (empty string, trailing comma) are not entries
    return sum(1 for entry in comma_seperated_string.split(",") if entry.strip())

Set Default Plotting Options#

# default plotting options
# Palette https://coolors.co/palette/0e7c7b-17bebb-ffc857-e9724c-c5283d
# Default figure height. This was `(800,)`: a stray trailing comma turned the
# number into a one-element tuple.
height = 800
# Continuous colour scale shared by the colour-coded charts
color_continuous_scale = px.colors.sequential.Aggrnyl
# Single colour used for histogram bars and scatter markers
marker_color = "#0E7C7B"
# Discrete colours (the palette linked above) used by the pie charts
color_discrete_sequence = ["#0E7C7B", "#17BEBB", "#FFC857", "#E9724C", "#C5283D"]

# Register the OpenSustain theme as a named plotly template: one font family
# and one text colour for body text, titles and legend titles.
pio.templates["OpenSustain"] = go.layout.Template(
    layout={
        "font": {
            "family": "Google Font",
            "color": "#040404",
            "size": 15,
        },
        "title_font_family": "Google Font",
        "title_font_color": "#040404",
        "legend_title_font_color": "#040404",
    },
)

# Layer the theme on top of plotly's default template for all later figures
pio.templates.default = "plotly+OpenSustain"
# Load the unfiltered project dataset and preview its first five rows
df_raw = pd.read_csv("./csv/projects.csv")
df_raw.head(5)
project_name oneliner git_namespace git_url platform topics rubric last_commit_date stargazers_count number_of_dependents ... organization_name organization_github_url organization_website organization_location organization_country organization_form organization_avatar organization_public_repos organization_created organization_last_update
0 pvlib-python A set of documented functions for simulating t... pvlib https://github.com/pvlib/pvlib-python.git github solar-energy,python,renewable-energy,renewable... Photovoltaics and Solar Energy 2022/08/31, 04:48:09 728.0 257.0 ... NaN https://github.com/pvlib NaN NaN NaN NaN https://avatars.githubusercontent.com/u/110372... NaN NaN NaN
1 pvfactors Open source view-factor model for diffuse shad... SunPower https://github.com/SunPower/pvfactors.git github solar-energy,renewable-energy,python,bifacial Photovoltaics and Solar Energy 2022/02/22, 21:53:32 62.0 7.0 ... NaN https://github.com/SunPower NaN NaN NaN NaN https://avatars.githubusercontent.com/u/134197... NaN NaN NaN
2 gsee Global Solar Energy Estimator. renewables-ninja https://github.com/renewables-ninja/gsee.git github solar,pandas,energy,irradiance,photovoltaic,pv... Photovoltaics and Solar Energy 2020/07/21, 06:28:35 88.0 0.0 ... NaN https://github.com/renewables-ninja https://www.renewables.ninja/ NaN NaN NaN https://avatars.githubusercontent.com/u/118382... NaN NaN NaN
3 PVMismatch An explicit Python PV system IV & PV curve tra... SunPower https://github.com/SunPower/PVMismatch.git github numpy,scipy,python,solar,photovoltaic Photovoltaics and Solar Energy 2022/04/14, 19:15:36 51.0 0.0 ... NaN https://github.com/SunPower NaN NaN NaN NaN https://avatars.githubusercontent.com/u/134197... NaN NaN NaN
4 rdtools An open source library to support reproducible... NREL https://github.com/NREL/rdtools.git github NaN Photovoltaics and Solar Energy 2022/09/02, 16:21:58 109.0 5.0 ... NaN https://github.com/NREL http://www.nrel.gov Golden, CO NaN NaN https://avatars.githubusercontent.com/u/190680... NaN NaN NaN

5 rows × 51 columns

Calculate Age in Years#

## KK: I would suggest using a clearer object-naming convention. Below it becomes unclear what's the difference between df and df_raw
# Age plots are better in years
df_raw["project_age_in_years"] = df_raw["project_age_in_days"] / 365
# Upper age limit (in years) applied by the age scatter plots further down
max_age_in_years = 8.0

Basic Statistics#

First, let us get a rough overview of the project dataset.

# Overview table with the headline statistics of the raw dataset
# (all platforms, active and inactive projects).
project_count = df_raw["project_name"].count()
platform_counts = df_raw["platform"].value_counts()
activity_counts = df_raw["project_active"].value_counts()


def _percent_true(column):
    """Share of True among the counted values of ``df_raw[column]`` in percent (2 decimals)."""
    return round(df_raw[column].value_counts(normalize=True)[True] * 100, 2)


# label -> value, in display order
overview_rows = {
    "Total number of projects": project_count,
    "Github projects": platform_counts["github"],
    "Gitlab projects": platform_counts["gitlab"],
    "Other platforms": platform_counts["custom"],
    # projects without an entry in "organization" count as personal namespace
    "Number of projects in personal namespace": project_count - df_raw["organization"].count(),
    "Total stars of all projects": df_raw["stargazers_count"].sum(),
    "Total contributers of all projects": df_raw["contributors"].sum(),
    "Active GitHub projects": activity_counts[True],
    "Inactive GitHub projects": activity_counts[False],
    "Projects with contribution guide in %": _percent_true("contribution_guide"),
    "Projects with code of conduct in %": _percent_true("code_of_conduct"),
    "Projects accepting donations in %": _percent_true("accepts_donations"),
    "Median number of commits": df_raw["total_number_of_commits"].median(),
    "Median stargazers": df_raw["stargazers_count"].median(),
    "Median stars last year": df_raw["stars_last_year"].median(),
    "Median Development Distribution Score": round(df_raw["development_distribution_score"].median(), 4),
    "Median number of contributors": df_raw["contributors"].median(),
    "Median closed issues last year": df_raw["issues_closed_last_year"].median(),
    "Median commits last year": df_raw["total_commits_last_year"].median(),
    "Median age in years": round(df_raw["project_age_in_years"].median(), 2),
}

fig = go.Figure(
    data=[
        go.Table(
            header={
                "values": ["Dimension", "Value"],
                "line_color": '#000000',
                "fill_color": '#ffffff',
                "font_size": 18,
            },
            cells={
                "fill_color": '#ffffff',
                "line_color": '#ffffff',
                "font_size": 16,
                "height": 30,
                "values": [list(overview_rows), list(overview_rows.values())],
            },
        )
    ]
)

fig.update_layout(height=1000, width=1000)
fig.show()

Development Distribution Score#

The Development Distribution Score (DDS) measures how development is distributed among a project's contributors by relating the commits of the contributor with the most commits to the total number of commits. Distributing knowledge, work, and governance across a project ensures sustainability: when people leave a project or no longer find time for it, others can continue and step into leading positions.

DDS is calculated in the preprocessing script and is similar to the bus factor. It is based only on quantitative values derived from git statistics.

Filter Data#

# Rows kept for the further analysis:
#  - only active projects,
#  - no "Curated Lists" (they are no classical open source projects),
#  - only projects hosted on the GitHub platform.
keep_rows = (
    (df_raw["project_active"] == True)
    & (df_raw["rubric"] != "Curated Lists")
    & (df_raw["platform"] == "github")
)
# .copy() keeps df_raw untouched when score columns are added to df_active
df_active = df_raw[keep_rows].copy()

Score Projects#

def _pct_rank(column, **rank_kwargs):
    """Percentile rank of ``df_active[column]``; extra keywords go to ``Series.rank``."""
    return df_active[column].rank(pct=True, **rank_kwargs)


# Each score is the sum of the percentile ranks of its indicators.

# Activity: what happened recently
df_active["activity"] = (
    _pct_rank("total_commits_last_year")
    + _pct_rank("issues_closed_last_year")
    # NOTE(review): ranked ascending like the other indicators, so a larger
    # number of days ranks higher -- confirm this is the intended direction.
    + _pct_rank("days_until_last_issue_closed")
    # projects without a release date get the lowest rank
    + _pct_rank("last_released_date", na_option="top")
)

# Community: how many people contribute and how the work is shared
df_active["community"] = (
    _pct_rank("contributors")
    + _pct_rank("development_distribution_score")
    + _pct_rank("reviews_per_pr")
)

# Size: accumulated commits, contributors, issues and pull requests
df_active["size"] = (
    _pct_rank("total_number_of_commits")
    + _pct_rank("contributors")
    + _pct_rank("closed_issues")
    + _pct_rank("closed_pullrequests")
)

# All scores are weighted equal and normalized to one
normalized_activity = df_active["activity"] / df_active["activity"].max()
normalized_community = df_active["community"] / df_active["community"].max()
normalized_size = df_active["size"] / df_active["size"].max()
df_active["total_score"] = (normalized_activity + normalized_community + normalized_size) / 3

# Save the dataset with the scores
df_active_path = "./csv/project_analysis.csv"
df_active.to_csv(df_active_path)
%%render
## The calculation within this cell shall give the reader an understanding of how the DDS is calculated.
## Values calculated here are not used in any other cell.
n_MaxCommitsSingleContributor = 90
n_total_commits = 100


DDS = 1 - n_MaxCommitsSingleContributor / n_total_commits
\[\begin{split} \begin{aligned} & \textrm{ The calcluation within this cell shall reader give an understanding on how the DDS is been calculated. }\\[10pt] & \textrm{ Values calculated here are not used in any other cell.}\\[10pt] n_{MaxCommitsSingleContributor} &= 90 \; \\[10pt] n_{total_{commits}} &= 100 \; \\[10pt] \mathrm{DDS} &= 1 - \frac{ n_{MaxCommitsSingleContributor} }{ n_{total_{commits}} } = 1 - \frac{ 90 }{ 100 } &= 0.100 \end{aligned} \end{split}\]
### KK: this is where a clear object naming convention + comments would really help: is syntax df[df_raw[..]] appropriate here? 
### KK: it might be helpful to plot boxplots for the below scores per category to better show their distribution, including median

# Subsets compared in the median DDS table below.
# Personal namespace = no entry in the "organization" column.
df_personal_projects = df_active[df_active["organization"].isna()]
df_organization_projects = df_active[df_active["organization"].notna()]
# Inactive projects are taken from the raw data; df_active holds active ones only
df_inactive = df_raw[(df_raw["project_active"] == False)]
df_top_stargazers = df_active[(df_active["stargazers_count"] > 100)]

fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=["Median DDS", "Value"],line_color='#000000',fill_color='#ffffff',font_size=18),
            cells=dict(
                        line_color='#ffffff',fill_color='#ffffff', font_size=16, height =30,
                values=[
                    [
                        "All projects",
                        "Active projects in personal namespace",
                        "Active organization projects",
                        "Active projects",
                        "Inactive projects",
                        # label now matches the `> 100` filter of df_top_stargazers (it said 50)
                        "Active projects with more than 100 Stars",

                    ],
                    [
                        round(df_raw["development_distribution_score"].median(),3),
                        round(df_personal_projects["development_distribution_score"].median(),3),
                        round(df_organization_projects["development_distribution_score"].median(),3),
                        round(df_active["development_distribution_score"].median(),3),
                        round(df_inactive["development_distribution_score"].median(),3),
                        round(df_top_stargazers["development_distribution_score"].median(),3),
                    ],
                ]
            ),
        )
    ]
)

fig.update_layout(
width=800

)

fig.show()
# Show one scored project (row position 300) with all of its columns
df_active.iloc[300]
project_name                                                                   EVCC
oneliner                          An extensible EV Charge Controller with PV int...
git_namespace                                                                 andig
git_url                                         https://github.com/evcc-io/evcc.git
platform                                                                     github
topics                            mqtt,golang,pv,wallbox,emobility,charger,wallb...
rubric                                                  Mobility and Transportation
last_commit_date                                               2022/09/04, 20:03:13
stargazers_count                                                              759.0
number_of_dependents                                                           23.0
stars_last_year                                                               492.0
project_active                                                                 True
dominating_language                                                              Go
organization                                                                    NaN
organization_user_name                                                      evcc-io
languages                         Go,Vue,JavaScript,Smarty,CSS,Shell,Makefile,Do...
homepage                                                            https://evcc.io
refs                                                                            NaN
project_created                                                2019/12/06, 16:27:04
project_age_in_days                                                          1003.0
license                                                                         MIT
total_commits_last_year                                                      1096.0
total_number_of_commits                                                      2190.0
last_issue_closed                                              2022/09/04, 21:01:01
open_issues                                                                    39.0
closed_pullrequests                                                          1394.0
closed_issues                                                                2266.0
issues_closed_last_year                                                      1257.0
days_until_last_issue_closed                                                    0.0
open_pullrequests                                                              23.0
reviews_per_pr                                                                  1.1
development_distribution_score                                             0.222484
last_released_date                                             2022/08/13, 11:31:37
last_release_tag_name                                                         0.100
good_first_issue                                                                0.0
contributors                                                                   57.0
accepts_donations                                                              True
donation_platforms                github,patreon,open_collective,ko_fi,tidelift,...
code_of_conduct                                                               False
contribution_guide                                                            False
dependents_repos                  JanDragon/evcc,opensprinklershop/evcc,matspi/e...
organization_name                                                               NaN
organization_github_url                                  https://github.com/evcc-io
organization_website                                                https://evcc.io
organization_location                                                       Germany
organization_country                                                            NaN
organization_form                                                               NaN
organization_avatar               https://avatars.githubusercontent.com/u/813835...
organization_public_repos                                                       NaN
organization_created                                                            NaN
organization_last_update                                                        NaN
project_age_in_years                                                       2.747945
activity                                                                   2.871102
community                                                                  2.193867
size                                                                       3.682952
total_score                                                                0.856184
Name: 406, dtype: object

Process Active GitHub Projects#

# Read the scored dataset back from disk. The file was written together with
# the DataFrame index, so the first column is restored as index instead of
# showing up as a stray "Unnamed: 0" data column.
df_active = pd.read_csv(df_active_path, index_col=0)

Start Plotting#

# Number of active projects per license.
# reset_index(name=...) names the count column explicitly: pandas >= 2.0 calls
# the value_counts() result "count" instead of using the source column name,
# which broke `values="license"` below.
license_his = (
    df_active["license"]
    .value_counts()
    .rename_axis("license_names")
    .reset_index(name="license")
)
fig = px.pie(license_his, values="license", names="license_names", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Licenses", showlegend=True, font_size=16)
fig.update_traces(textposition='inside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
fig.show()
# Two histograms over the active projects that share all settings except the
# plotted column and its labels: (column, figure title, x axis title).
for hist_column, hist_title, hist_x_title in (
    ("project_age_in_years", "Distribution of Project Age in Years", "Project Age"),
    ("total_number_of_commits", "Distribution of Total Commits", "Project Total Commits"),
):
    fig = px.histogram(df_active, x=hist_column, nbins=50, title=hist_title)
    fig.update_layout(yaxis_title="Projects", xaxis_title=hist_x_title)
    fig.update_traces(marker_color=marker_color)
    fig.show()
# Number of active projects per rubric.
# reset_index(name=...) names the count column explicitly: pandas >= 2.0 calls
# the value_counts() result "count" instead of using the source column name,
# which broke `values="rubric"` below.
rubric_his = (
    df_active["rubric"]
    .value_counts()
    .rename_axis("rubric_names")
    .reset_index(name="rubric")
)
fig = px.pie(rubric_his, values="rubric", names="rubric_names", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Projects within Rubrics", height=1200, showlegend=False)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
# Three pie charts of per-rubric aggregates:
# (aggregated series, value column, figure title, figure height).
projects_by_rubric = df_active.groupby('rubric')
for pie_series, pie_column, pie_title, pie_height in (
    (projects_by_rubric['contributors'].sum(), "contributors", "Contributors within Rubrics", 1200),
    (projects_by_rubric['stargazers_count'].sum(), "stargazers_count", "Stars within Rubrics", 1000),
    (
        projects_by_rubric['development_distribution_score'].median(),
        "development_distribution_score",
        "Median Development Distribution Score within Rubrics",
        1000,
    ),
):
    fig = px.pie(
        pie_series.reset_index(),
        values=pie_column,
        names="rubric",
        color_discrete_sequence=color_discrete_sequence,
        hole=0.2,
    )
    fig.update_layout(title=pie_title, height=pie_height, showlegend=False)
    fig.update_traces(
        textposition='outside',
        textinfo='value+label',
        marker={"line": {"color": '#000000', "width": 2}},
    )
    fig.show()
# Stars gained during the last year, summed per rubric
fig = px.pie(df_active.groupby('rubric')['stars_last_year'].sum().reset_index(), values="stars_last_year", names="rubric", color_discrete_sequence=color_discrete_sequence, hole=0.2)

# The title used to be a copy of the all-time chart ("Stars within Rubrics")
fig.update_layout(title="Stars last Year within Rubrics", height=1200, showlegend=False)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
# Number of active projects per dominating programming language.
# reset_index(name=...) names the count column explicitly: pandas >= 2.0 calls
# the value_counts() result "count" instead of using the source column name,
# which broke the filter and `values=` below.
license_dominating_language = (
    df_active["dominating_language"]
    .value_counts()
    .rename_axis("dominating_language_names")
    .reset_index(name="dominating_language")
)
license_dominating_language
# Keep only languages that dominate more than four projects
license_dominating_language = license_dominating_language[(license_dominating_language["dominating_language"] > 4)]
fig = px.pie(license_dominating_language, values="dominating_language", names="dominating_language_names", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Programming Languages", showlegend=True, font_size=16,height=800)
fig.update_traces(textposition='outside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
fig.show()
# Number of active projects per (rubric, dominating language) pair.
# The count column is named explicitly: renaming column `0` only works while
# value_counts() returns an unnamed Series (pandas < 2.0); newer versions name
# it "count" and the rename silently did nothing.
df_language_distribution = df_active.value_counts(
    ["rubric", "dominating_language"]
).reset_index(name="counts")

fig = px.scatter(
    df_language_distribution, x="dominating_language", y="rubric", size="counts",
)


fig.update_layout(
    height=1000,  # Added parameter
    xaxis_title="Dominating Language",
    yaxis_title="Rubric",
)
fig.update_traces(marker_color=marker_color)


fig.show()
# Number of active projects per (rubric, license) pair.
# The count column is named explicitly: renaming column `0` only works while
# value_counts() returns an unnamed Series (pandas < 2.0); newer versions name
# it "count" and the rename silently did nothing.
df_license_distribution = df_active.value_counts(
    ["rubric", "license"]
).reset_index(name="counts")

fig = px.scatter(df_license_distribution, x="license", y="rubric", size="counts")


fig.update_layout(
    height=1000,  # Added parameter
    xaxis_title="License",
    yaxis_title="Rubric",
    title="License Distribution over Rubric",
    autosize=True,
)
fig.update_traces(marker_color=marker_color)


fig.show()
# Histogram of the number of contributors per active project
fig = px.histogram(df_active, x="contributors", nbins=100, title=" Contributors")
fig.update_layout(xaxis_title="Contributors", yaxis_title="Projects")
fig.update_traces(marker_color=marker_color)
fig.show()
# Number of listed active projects per git namespace, most frequent first.
# The count column is named explicitly: pandas >= 2.0 calls the value_counts()
# result "count", so renaming "git_namespace" to "counts" no longer applied.
most_listed_projects = (
    df_active["git_namespace"]
    .value_counts(ascending=False)
    .rename_axis("Namespace")
    .reset_index(name="counts")
)
fig = go.Figure(data=[go.Table(
    header=dict(values=list(most_listed_projects.columns), line_color='#000000', fill_color='#ffffff',font_size=18 ),
    cells=dict(
        line_color='#ffffff',
        fill_color='#ffffff',
        font_size=16,
        height=30,
        values=[most_listed_projects["Namespace"], most_listed_projects["counts"]],
    )
)])

fig.update_layout(
autosize=False,
)

fig.show()
# The forty oldest active projects, coloured by their DDS
oldest_projects = df_active.nlargest(40, "project_age_in_years")

fig = px.bar(
    x=oldest_projects["project_age_in_years"],
    y=oldest_projects["project_name"],
    orientation="h",
    # hard-coded zoom on the age range of the current dataset
    range_x=(9.6, 14),
    hover_name=oldest_projects["git_url"],
    color=oldest_projects["development_distribution_score"],
    color_continuous_scale=color_continuous_scale
)

fig.update_layout(
    height=1000,  # Added parameter
    # the y axis shows project names (it was labelled "Rubric")
    yaxis_title="Project",
    xaxis_title="Project Age in Years",
    title="The oldest Projects still active",
    coloraxis_colorbar=dict(
    title="DDS",
    ),
)



fig.update(layout_showlegend=False)
# The forty active projects with the most contributors, coloured by their DDS
contributors = df_active.nlargest(40, "contributors")

fig = px.bar(
    x=contributors["contributors"],
    y=contributors["project_name"],
    color=contributors["development_distribution_score"],
    hover_name=contributors["git_url"],
    orientation="h",
    title="Projects with most contributors",
    color_continuous_scale=color_continuous_scale,
)

# the layout title below replaces the one passed to px.bar
fig.update_layout(
    title="Projects with the most contributors",
    xaxis_title="Contributors",
    yaxis_title="Project",
    height=1200,  # Added parameter
    coloraxis_colorbar={"title": "DDS"},
)

fig.update(layout_showlegend=False)
# The forty active projects with the most stars, coloured by their DDS
top_stargazers = df_active.nlargest(40, "stargazers_count")

fig = px.bar(
    x=top_stargazers["stargazers_count"],
    y=top_stargazers["project_name"],
    color=top_stargazers["development_distribution_score"],
    hover_name=top_stargazers["git_url"],
    orientation="h",
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    title="Projects with the most Stars",
    xaxis_title="Stars",
    yaxis_title="Project",
    height=1000,  # Added parameter
    coloraxis_colorbar={"title": "DDS"},
)

fig.update(layout_showlegend=False)
# Star growth = share of a project's stars that were gained during the last
# year. Only projects with more than 100 stars are considered.
df_top_100_stargazers = df_active[(df_active["stargazers_count"]) > 100].copy()
df_top_100_stargazers["star_growth"] = (
    df_top_100_stargazers["stars_last_year"] / df_top_100_stargazers["stargazers_count"]
)

df_top_40_star_growth = df_top_100_stargazers.nlargest(40, "star_growth")
fig = px.bar(
    x=df_top_40_star_growth["star_growth"] * 100,  # ratio -> percent
    y=df_top_40_star_growth["project_name"],
    orientation="h",
    hover_name=df_top_40_star_growth["git_url"],
    color=df_top_40_star_growth["development_distribution_score"],
    color_continuous_scale=color_continuous_scale
)

fig.update_layout(
    height=1000,  # Added parameter
    xaxis_title="Star Growth last Year [%]",
    yaxis_title="Project",
    title="Projects with the highest Star Growth",
    # colorbar title added for consistency with the other DDS-coloured charts
    coloraxis_colorbar=dict(
    title="DDS",
    ),
)
# The active projects with the most commits during the last year, coloured by DDS
df_top_40_growth = df_active.nlargest(40, "total_commits_last_year")
# NOTE(review): ElexonDataPortal is excluded by hand; the reason is not
# stated in this file (presumably an outlier) -- confirm.
df_top_40_growth = df_top_40_growth[df_top_40_growth["project_name"] != "ElexonDataPortal"]
fig = px.bar(
    x=df_top_40_growth["total_commits_last_year"],
    y=df_top_40_growth["project_name"],
    orientation="h",
    color=df_top_40_growth["development_distribution_score"],
    hover_name=df_top_40_growth["git_url"],
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    height=1000,  # Added parameter
    # The x axis shows the absolute number of commits, not a growth rate in
    # percent, and the colour encodes the DDS, not the rubric.
    xaxis_title="Commits last Year",
    yaxis_title="Project",
    title="Projects with the most Commits last Year",
    coloraxis_colorbar=dict(
    title="DDS",
    ),
)
# The forty active projects with the highest total score, coloured by DDS
df_total_score = df_active.nlargest(40, "total_score")

fig = px.bar(
    x=df_total_score["total_score"],
    y=df_total_score["project_name"],
    color=df_total_score["development_distribution_score"],
    hover_name=df_total_score["git_url"],
    orientation="h",
    # hard-coded zoom on the upper end of the score range
    range_x=(0.85, 1),
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    title="Top Total Score",
    xaxis_title="Total Score",
    yaxis_title="Project",
    height=1000,  # Added parameter
    coloraxis_colorbar={"title": "DDS"},
)
fig.update(layout_showlegend=False)
# The forty active projects with the highest activity score, coloured by DDS
df_activity_score = df_active.nlargest(40, "activity")

fig = px.bar(
    x=df_activity_score["activity"],
    y=df_activity_score["project_name"],
    color=df_activity_score["development_distribution_score"],
    hover_name=df_activity_score["git_url"],
    orientation="h",
    # hard-coded zoom on the upper end of the score range
    range_x=(2.9, 3.2),
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    title="Projects with the highest Activity Score",
    xaxis_title="Activity Score",
    yaxis_title="Project",
    height=1000,  # Added parameter
    coloraxis_colorbar={"title": "DDS"},
)

fig.update(layout_showlegend=False)
# The forty active projects with the highest size score, coloured by DDS
df_size_score = df_active.nlargest(40, "size")

fig = px.bar(
    x=df_size_score["size"],
    y=df_size_score["project_name"],
    color=df_size_score["development_distribution_score"],
    hover_name=df_size_score["git_url"],
    orientation="h",
    # hard-coded zoom on the upper end of the score range
    range_x=(3.75, 4),
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    title="Projects with the highest Size Score",
    xaxis_title="Size Score",
    yaxis_title="Project",
    height=1000,  # Added parameter
    coloraxis_colorbar={"title": "DDS"},
)

fig.update(layout_showlegend=False)
# Total score per project over project age and rubric; the marker size encodes
# the size score. Only projects younger than max_age_in_years are shown.
younger_projects = df_active[df_active["project_age_in_years"] < max_age_in_years]

fig = px.scatter(
    younger_projects,
    x="project_age_in_years",
    y="rubric",
    color="total_score",
    size="size",
    size_max=20,
    hover_name="git_url",
)

fig.update_layout(
    title="Total Score of Projects",
    xaxis_title="Project Age in Years",
    yaxis_title="Rubric",
    height=1000,  # Added parameter
    coloraxis_colorbar={"title": "Total Score"},
)


fig.show()
# Number of active projects per rubric, drawn as a treemap.
# reset_index(name=...) names the count column explicitly: pandas >= 2.0 calls
# the value_counts() result "count" instead of using the source column name,
# which broke `values="rubric"` below.
rubric_his = (
    df_active["rubric"].value_counts().rename_axis("rubric_name").reset_index(name="rubric")
)


fig = px.treemap(rubric_his, path=["rubric_name"], values="rubric", color="rubric")

fig.update_layout(coloraxis_showscale=False)
fig.update_layout(
    autosize=False,
    paper_bgcolor="lightgray",
    height=700,  # Added parameter
    width=2100,
    uniformtext=dict(minsize=15, mode="show"),
    margin=dict(t=0, l=0, r=0, b=0),
)
fig.show()
# DDS of organization projects over project age and rubric; the marker size
# encodes the size score. Only projects younger than max_age_in_years are shown.
fig = px.scatter(
    df_organization_projects.query("project_age_in_years<@max_age_in_years"),
    x="project_age_in_years",
    y="rubric",
    size="size",
    color="development_distribution_score",
    hover_name="git_url",
    size_max=20,
    # Reversed copy of the shared scale. list.reverse() returns None (so the
    # plotly default scale was used) and flipped the shared palette in place,
    # changing the colours of every chart drawn afterwards.
    color_continuous_scale=color_continuous_scale[::-1],
)

fig.update_layout(
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    yaxis_title="Rubric",
    xaxis_title="Project Age in Years",
    height=1000,  # Added parameter
    title="Development Distribution Score",
)
fig.show()
# The forty personal-namespace projects with the most stars, coloured by DDS
personal_stargazers = df_personal_projects.nlargest(40, "stargazers_count")

fig = px.bar(
    x=personal_stargazers["stargazers_count"],
    y=personal_stargazers["git_namespace"],
    orientation="h",
    hover_name=personal_stargazers["git_url"],
    color=personal_stargazers["development_distribution_score"],
    color_continuous_scale=color_continuous_scale
)

fig.update_layout(
    height=1000,  # Added parameter
    # the y axis shows git namespaces (it was labelled "Rubric")
    yaxis_title="Namespace",
    xaxis_title="Stars",
    title="Projects with most Stars in User Namespace",
    coloraxis_colorbar=dict(
    title="DDS",
    ),
)


fig.update(layout_showlegend=False)
# Number of dependent repositories per project, derived from the comma
# separated "dependents_repos" column
df_active["dependents_count"] = df_active["dependents_repos"].apply(count_strings)

most_dependent_projects = df_active.nlargest(50, "dependents_count")
# NOTE(review): "Mission Support System" is excluded by hand; the reason is
# not stated in this file -- confirm.
most_dependent_projects = most_dependent_projects[most_dependent_projects["project_name"] != "Mission Support System"]
print("DDS of most used Python project:",round(most_dependent_projects["development_distribution_score"].median(),3))


fig = px.bar(
    x=most_dependent_projects["dependents_count"],
    y=most_dependent_projects["project_name"],
    orientation="h",
    hover_name=most_dependent_projects["git_url"],
    color=most_dependent_projects["development_distribution_score"],
    color_continuous_scale=color_continuous_scale
)

fig.update_layout(
    height=1000,  # Added parameter
    # the y axis shows project names (it was labelled "Rubric")
    yaxis_title="Project",
    xaxis_title="Dependents",
    title="Most used Python Projects vs. DDS",
    coloraxis_colorbar=dict(
    title="DDS",
    ),
)
DDS of most used Python project: 0.436

Process the organizations#

# Load the GitHub organizations dataset and preview its first rows
df_organizations = pd.read_csv("./csv/github_organizations.csv")
df_organizations.head()
organization_name organization_user_name organization_github_url organization_website location_city location_country form_of_organization organization_avatar organization_public_repos organization_created organization_last_update rubric
0 NaN AgroCares https://github.com/AgroCares https://grasplan.nl/ NaN Netherlands community https://avatars.githubusercontent.com/u/316846... 6 2017-09-06 06:22 2021-11-16 11:18 NaN
1 DSMR-reader dsmrreader https://github.com/dsmrreader https://dsmr-reader.readthedocs.io NaN Netherlands community https://avatars.githubusercontent.com/u/577273... 1 2019-11-13 19:08 2021-11-14 20:43 NaN
2 STS Rosario STS-Rosario https://github.com/STS-Rosario http://www.stsrosario.org.ar/index.html NaN Argentina community https://avatars.githubusercontent.com/u/244938... 2 2016-12-10 14:07 2021-11-03 21:52 NaN
3 Open Solar Project opensolarproject https://github.com/opensolarproject NaN NaN Australia for-profit https://avatars.githubusercontent.com/u/539539... 2 2019-08-09 20:31 2021-11-14 16:10 NaN
4 Open Food Foundation openfoodfoundation https://github.com/openfoodfoundation https://www.openfoodnetwork.org/open-food-foun... Melbourne Australia non-profit https://avatars.githubusercontent.com/u/257898... 53 2012-10-17 07:53 2021-11-19 05:14 NaN
# Resolve each organization's country name to an ISO alpha-3 code, then to an
# alpha-2 code, then to a continent code. Each helper returns "" when its
# lookup fails, so unresolved rows end up with an empty continent code.
df_organizations["ISO_3"] = df_organizations["location_country"].apply(name_to_iso3)
df_organizations["ISO_3_alpha2"] = df_organizations["ISO_3"].apply(alpha3_to_alpha2)
df_organizations["continent"] = df_organizations["ISO_3_alpha2"].apply(alpha2_to_continent)

# Count organizations per continent code. The count column is named
# explicitly: value_counts() calls it "continent" on pandas < 2.0 but "count"
# on pandas >= 2.0, and the pie chart below looks the column up by name.
continent_his = (
    df_organizations["continent"]
    .value_counts()
    .rename("continent")
    .to_frame()
    .rename_axis("continent_name")
)
# Replace the two-letter continent codes with readable labels; the empty code
# (no continent resolved) is displayed as "Global".
continent_his = continent_his.rename(index={"EU": "Europe", "NA": "North America", "": "Global", "OC":"Oceania", "AS":"Asia", "SA":"South America", "AF":"Africa"})

print(continent_his)
fig = px.pie(continent_his.reset_index(), values="continent", names="continent_name", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Organizations between Continents", font_size=16)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
                continent
continent_name           
Europe                203
North America         191
Global                176
Oceania                19
Asia                   12
South America           6
Africa                  4
## https://octoverse.github.com/
# Share of users per continent in percent. The values are kept in a list so
# that each one is paired with the label at the same position; a set literal
# has no defined order and may be rejected by pandas as unordered.
# NOTE(review): the pairing below assumes the smallest share belongs to
# Oceania and the largest to North America -- confirm against the Octoverse
# report.
index_labels=['Oceania','Africa','South America','Europe','Asia','North America']
values = [1.7, 2.3, 5.9, 27.3, 31.2, 31.5]
# reset_index() exposes the labels as column "index" and the shares as column 0.
df_users_continent_cotoverse = pd.DataFrame(values,index=index_labels).reset_index()
fig = px.pie(df_users_continent_cotoverse, values=0, names="index", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Users between Continents", font_size=16)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
# Count organizations per organizational form. Both output columns are named
# explicitly ("organization" for the form, "form_of_organization" for the
# count) so the result does not depend on how the installed pandas version
# names the value_counts() result (that naming changed in pandas 2.0).
organization_his = (
    df_organizations["form_of_organization"]
    .value_counts()
    .rename_axis("organization")
    .reset_index(name="form_of_organization")
)

# Title-case the form names for display (e.g. "for-profit" -> "For-Profit").
organization_his["organization"] = organization_his["organization"].apply(upper_string)
print(organization_his)
fig = px.pie(organization_his, values="form_of_organization", names="organization", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Organizational Forms", font_size=16)
fig.update_traces(textposition='outside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
        organization  form_of_organization
0          Community                   160
1           Academia                   144
2  Government Agency                    99
3         For-Profit                    85
4         Non-Profit                    65
5      Collaboration                    58
# Count organizations per ISO alpha-3 country code. The columns are named
# explicitly ("country" / "counts") instead of renaming the column that
# value_counts() produces, because that column is called "ISO_3" on
# pandas < 2.0 but "count" on pandas >= 2.0.
df_countries = (
    df_organizations["ISO_3"]
    .value_counts()
    .rename_axis("country")
    .reset_index(name="counts")
)

# World map coloured by the number of organizations per country.
fig = px.choropleth(
    df_countries,
    locations="country",
    locationmode="ISO-3",
    color="counts",
    color_continuous_scale=color_continuous_scale
)

fig.update_layout(title="Distribution of Organizational Locations Worldwide",
                    coloraxis_colorbar=dict(
                    title="Organizations",
                    ),)

fig.show()
# The 40 organizations with the highest public repository count.
df_public_repos = df_organizations.nlargest(n=40, columns="organization_public_repos")

# Preview the top of that ranking.
df_public_repos.head(n=5)
organization_name organization_user_name organization_github_url organization_website location_city location_country form_of_organization organization_avatar organization_public_repos organization_created organization_last_update rubric ISO_3 ISO_3_alpha2 continent
298 Microsoft microsoft https://github.com/microsoft https://opensource.microsoft.com Redmond, WA USA for-profit https://avatars.githubusercontent.com/u/615472... 4485 2013-12-10 19:06 2021-11-20 00:29 NaN USA US NA
307 International Business Machines IBM https://github.com/IBM https://www.ibm.com/opensource/ Armonk, NY USA for-profit https://avatars.githubusercontent.com/u/145911... 2278 2012-02-21 22:13 2021-11-19 23:15 NaN USA US NA
321 The Apache Software Foundation apache https://github.com/apache https://www.apache.org/ NaN USA non-profit https://avatars.githubusercontent.com/u/47359?v=4 2275 2009-01-17 20:14 2021-11-20 00:34 NaN USA US NA
296 Google google https://github.com/google https://opensource.google/ Mountain View, CA USA for-profit https://avatars.githubusercontent.com/u/134200... 2139 2012-01-18 01:30 2021-11-19 22:27 NaN USA US NA
292 Microsoft Azure Azure https://github.com/Azure https://docs.microsoft.com/en-us/azure/ Redmond, WA USA for-profit https://avatars.githubusercontent.com/u/684449... 1618 2014-03-03 22:17 2021-11-19 23:35 NaN USA US NA
# Age of every organization account in years, derived from its creation
# timestamp by calc_age.
df_organizations["organizations_age_in_years"] = df_organizations[
    "organization_created"
].apply(calc_age)

# One bubble per organization: age on x, country on y, bubble size by number
# of public repositories, colour by organizational form. Only organizations
# younger than max_age_in_years are plotted. The colour column is categorical,
# so no continuous colour scale or colorbar is configured.
fig = px.scatter(
    df_organizations.query("organizations_age_in_years<@max_age_in_years"),
    x="organizations_age_in_years",
    y="location_country",
    size="organization_public_repos",
    color="form_of_organization",
    hover_name="organization_website",
    size_max=20,
)

fig.update_layout(
    legend_title_text="Form of Organization",
    # Axis titles describe what is actually plotted (country / organization age).
    yaxis_title="Country",
    xaxis_title="Organization Age in Years",
    height=1000,  # Added parameter
    title="Organizations forms within different countries",
)
fig.show()

Not included Projects#

Within the first version of this study we were not able to integrate a GitLab API interface. Projects on self-hosted repositories and other collaborative websites could also not be included in the study. Another group that was not included in the study is the inactive projects. Here we try to give an insight into these projects.

# List every project whose platform is recorded as "gitlab".
df_raw.loc[df_raw["platform"].eq("gitlab")]
project_name oneliner git_namespace git_url platform topics rubric last_commit_date stargazers_count number_of_dependents ... organization_github_url organization_website organization_location organization_country organization_form organization_avatar organization_public_repos organization_created organization_last_update project_age_in_years
136 emobpy An open tool for creating battery-electric veh... diw-evu/emobpy https://gitlab.com/diw-evu/emobpy/emobpy gitlab NaN Battery NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
190 dieter_py An open source power sector optimization model... diw-evu/dieter_public https://gitlab.com/diw-evu/dieter_public/dieterpy gitlab NaN Energy Modeling and Optimization NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
279 pyehub A Python-based, modular and nestable implement... energyincities https://gitlab.com/energyincities/python-ehub gitlab NaN Energy Distribution and Grids NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
286 mosaik A flexible Smart Grid co-simulation framework. mosaik https://gitlab.com/mosaik/mosaik gitlab NaN Energy Distribution and Grids NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
287 SmartGridToolbox Designed to provide an extensible and flexible... SmartGridToolbox https://gitlab.com/SmartGridToolbox/SmartGridT... gitlab NaN Energy Distribution and Grids NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
326 KoaVTracker Energy targets in the coalition agreement of t... diw-evu https://gitlab.com/diw-evu/koavtracker gitlab NaN Datasets on Energy Systems NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
359 Energy Signature Analyser A toolbox to analyze energy signatures of buil... energyincities https://gitlab.com/energyincities/energy-signa... gitlab NaN Buildings and Heating NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
367 BESOS A collection of modules for the simulation and... energyincities https://gitlab.com/energyincities/besos gitlab NaN Buildings and Heating NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
384 Macquette A whole house energy assessment tool, which mo... retrofitcoop https://gitlab.com/retrofitcoop/macquette gitlab NaN Buildings and Heating NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
429 sustainable-mobility-api Consists of a Python library and HTTP API for ... mshepherd https://gitlab.com/mshepherd/sustainable-mobil... gitlab NaN Mobility and Transportation NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
462 H2020 CATALYST Converting data centres in energy flexibility ... NaN https://gitlab.com/project-catalyst gitlab NaN Computation and Communication NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
469 ecometer Loads websites, compute metrics (from network ... ecoconceptionweb https://gitlab.com/ecoconceptionweb/ecometer gitlab NaN Computation and Communication NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
478 Carbon-API-2.0 Estimating the carbon emissions per page on th... wholegrain https://gitlab.com/wholegrain/carbon-api-2-0 gitlab NaN Computation and Communication NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
504 CarbonFootprint A browser extension that displays carbon footp... aossie https://gitlab.com/aossie/CarbonFootprint gitlab NaN Carbon Intensity and Accounting NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
519 OpenIAM An open source integrated assessment model dev... NRAP https://gitlab.com/NRAP/OpenIAM gitlab NaN Carbon Capture and Removel NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
530 vein An R package to estimate Vehicular Emissions I... ibarraespinosa https://gitlab.com/ibarraespinosa/vein gitlab NaN Emission Observation and Modeling NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
623 OpenSimRoot Source code for simulating root architecture, ... rootmodels https://gitlab.com/rootmodels/OpenSimRoot gitlab NaN Biosphere NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
684 EU forest tree point data A compilation of analysis-ready point data for... openlandmap https://gitlab.com/openlandmap/eu-forest-tree-... gitlab NaN Biosphere NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
723 GlaThiDa Glacier Thickness Database. wgms https://gitlab.com/wgms/glathida gitlab NaN Cryosphere NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
820 met.3D Interactive three-dimensional visualization of... wxmetvis https://gitlab.com/wxmetvis/met.3d gitlab NaN Atmosphere NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
838 The Global Environmental Multiscale Model An integrated forecasting and data assimilatio... eccc/gem https://gitlab.com/eccc/gem/gem gitlab NaN Earth and Climate Modeling NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1014 Imod-Python Designed to help you in your MODFLOW groundwat... deltares/imod https://gitlab.com/deltares/imod/imod-python gitlab NaN Water Supply NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1026 RTC-Tools A toolbox for control and optimization of wate... deltares https://gitlab.com/deltares/rtc-tools gitlab NaN Water Supply NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1057 WWTP EU28 Waste Water Treatment Plants. hotmaps/potential https://gitlab.com/hotmaps/potential/WWTP gitlab NaN Water Supply NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1106 OpenLandMap Data, services and web-apps providing access a... NaN https://gitlab.com/openlandmap gitlab NaN Soil and Land NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1200 XDC Model Enable users, or any intereted subject, to und... xdc-model https://gitlab.com/xdc-model/xdc gitlab NaN Sustainable Investment NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1263 Eumap Comprises environmental, land cover, terrain, ... geoharmonizer_inea https://gitlab.com/geoharmonizer_inea/eumap gitlab NaN Data Catalogs and Interfaces NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

27 rows × 52 columns

# Work on an independent copy of the projects flagged as inactive so that the
# column added below does not touch df_raw.
df_inactive = df_raw[(df_raw["project_active"] == False)].copy()

# Age plots are better in years
df_inactive["project_age_in_years"] = df_inactive["project_age_in_days"] / 365

# One bubble per inactive project: age on x, rubric on y, bubble size by
# number of contributors, colour by development distribution score (DDS).
fig = px.scatter(
    df_inactive,
    x="project_age_in_years",
    y="rubric",
    size="contributors",
    color="development_distribution_score",
    hover_name="git_url",
    size_max=20,
    # Pass a reversed copy of the palette. list.reverse() returns None (so no
    # scale would be passed at all) and reverses the shared palette in place,
    # which would flip the colours of every other plot using it.
    color_continuous_scale=color_continuous_scale[::-1],
)

fig.update_layout(
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    paper_bgcolor="lightgray",
    height=1000,  # Added parameter
    yaxis_title="Rubric",
    xaxis_title="Project Age in years",
    title="Development Distribution Score within inactive Projects",
)

fig.show()